# Load the text-mining toolkit: gutenbergr for Project Gutenberg downloads,
# tidyverse for wrangling.
library(gutenbergr)
library(tidyverse)

# Fetch the four novels in a single call:
# 219 Heart of Darkness, 844 The Importance of Being Earnest,
# 43 Dr. Jekyll and Mr. Hyde, 4300 Ulysses.
all_books_df <- gutenberg_download(
  c(219, 844, 43, 4300),
  mirror = "http://mirrors.xmission.com/gutenberg/"
)

library(tidytext)

# Tokenize to one word per row, keeping the original line number.
all_tokens <- all_books_df %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

# Raw word frequencies (stop words still present).
all_tokens %>% count(word, sort = TRUE)

# Remove SMART-lexicon stop words, then recount.
all_tidybook <- all_tokens %>%
  anti_join(get_stopwords(source = "smart"), by = "word")
all_tidybook %>% count(word, sort = TRUE)
library(gutenbergr)
library(dplyr)
library(tidytext)

# Download one Gutenberg book, tokenize it to one word per row, and remove
# words in the tidytext `stop_words` lexicon. Factored into a helper because
# the original script repeated this identical pipeline four times.
#
# @param id     Gutenberg book id (numeric scalar).
# @param mirror Gutenberg mirror URL; default matches the rest of the script.
# @return A tibble with columns gutenberg_id, line, word.
tidy_gutenberg_book <- function(id,
                                mirror = "http://mirrors.xmission.com/gutenberg/") {
  gutenberg_download(id, mirror = mirror) %>%
    mutate(line = row_number()) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
}

# Per-book tidy data frames, each followed by its word-frequency printout.
heartofdark_tidy <- tidy_gutenberg_book(219)
heartofdark_tidy %>% count(word, sort = TRUE)

earnest_tidy <- tidy_gutenberg_book(844)
earnest_tidy %>% count(word, sort = TRUE)

jekyllandhyde_tidy <- tidy_gutenberg_book(43)
jekyllandhyde_tidy %>% count(word, sort = TRUE)

ulysses_tidy <- tidy_gutenberg_book(4300)
ulysses_tidy %>% count(word, sort = TRUE)
# Combine the four per-book tidy data frames, tagging each row with its author.
# BUG FIX: the original referenced undefined objects (tidy_joseph_conrad,
# tidy_oscar_wilde, tidy_louis_stevenson, tidy_james_joyce); the data frames
# created above are actually named heartofdark_tidy, earnest_tidy,
# jekyllandhyde_tidy, and ulysses_tidy.
author_bind <- bind_rows(
  mutate(heartofdark_tidy, author = "Joseph Conrad"),
  mutate(earnest_tidy, author = "Oscar Wilde"),
  mutate(jekyllandhyde_tidy, author = "Robert Louis Stevenson"),
  mutate(ulysses_tidy, author = "James Joyce")
)

# Per-author word proportions, reshaped so James Joyce remains a reference
# column while the other three authors are stacked long for comparison.
frequency <- author_bind %>%
  mutate(word = str_extract(word, "[a-z']+")) %>% # strip Gutenberg "_word_" markup
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>% # drop raw counts, keep proportions only
  # pivot_wider()/pivot_longer() supersede spread()/gather()
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer(
    cols = c(`Joseph Conrad`, `Oscar Wilde`, `Robert Louis Stevenson`),
    names_to = "author",
    values_to = "proportion"
  )
# Inspect the reshaped frequency table.
names(frequency)
# [1] "word" "James Joyce" "author" "proportion"
# (the line above was raw console output pasted into the original script,
# which would not parse; it is kept here as a comment)

# The original called tabyl(), which belongs to the janitor package that is
# never loaded in this script; a dplyr count of rows per author gives the
# same tabulation without the extra dependency.
count(frequency, author)
head(frequency)
The plots for all three approaches are comparable, with NRC having the largest positive skew. AFINN and Bing are nearly identical in their ability to distinguish positive from negative words, although Bing shows more negative spikes and longer continuous runs of positive and negative sentiment.
The most positive emotion is shown by NRC, while the highest negative sentiment is shown by AFINN.
We'll examine the contribution of positive and negative words in NRC and Bing, since those lexicons show the most volatility.
# Count how many lexicon entries fall in each sentiment category.
# NRC carries many categories, so restrict it to positive/negative first.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

# Bing is already binary, so count directly.
get_sentiments("bing") %>% count(sentiment)
Negative terms are more common than positive ones in both cases, but the ratio of negative to positive words is larger in Bing, which makes sense given the plots we observed.
Finally, we check the most common positive and negative words.
# Taking the combined tidy data set of all books created at the beginning
# (all_tidybook) and adding a human-readable title for each Gutenberg id.
level_key <- c(
  `219` = "Heart of Darkness",
  `844` = "Importance of Being Earnest",
  `43` = "Dr. Jekyll and Mr. Hyde",
  `4300` = "Ulysses"
)

# Add the title column and move it next to the id. relocate() with a named
# column replaces subset(select = c(1, 4, 2, 3)), whose positional indices
# silently break if the column order ever changes.
all_tidybook <- all_tidybook %>%
  mutate(title = recode(gutenberg_id, !!!level_key)) %>%
  relocate(title, .after = gutenberg_id)

# tf-idf of every word within each book.
all_book_tf_idf <- all_tidybook %>%
  count(title, word, sort = TRUE) %>%
  bind_tf_idf(word, title, n)
all_book_tf_idf

# Ten highest tf-idf words across all four books.
all_book_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  head(10)
# Top 15 tf-idf words per book, one facet per title.
# reorder_within() + scale_y_reordered() (tidytext) order the bars within
# each facet independently; a plain fct_reorder() sorts words globally, so
# a word appearing in two books would get the wrong position in one facet.
all_book_tf_idf %>%
  group_by(title) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, reorder_within(word, tf_idf, title), fill = title)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~title, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)